import re
import time

import requests
from pymysql import connect
# Excel export scaffold (unused; kept for reference)
# from openpyxl import workbook
# wb = workbook.Workbook()
# ws = wb.active  # activate the worksheet
# ws.append(['movie link', 'image link', 'movie title', 'other info'])

class Spider():
    """Scrape the Douban Top250 movie listing and persist each entry to MySQL.

    Usage: set ``self.url`` to a listing page, then call :meth:`get_data`.
    Each matched movie is inserted into the ``textben`` table and echoed
    to stdout. Call :meth:`close` when finished to release DB resources.
    """

    # One <div class="item"> block on the listing page yields four groups:
    # (movie detail link, poster image link, movie title, misc info text).
    # Compiled once here instead of rebuilding the pattern on every page.
    ITEM_RE = re.compile(
        r'<div\sclass="item">.*?<a\shref="(.*?)">.*?'
        r'<img\swidth="100"\salt=".*?"\ssrc="(.*?)"\sclass="">.*?'
        r'<span\sclass="title">(.*?)</span>.*?<p\sclass="">(.*?)</p>',
        re.S,
    )

    def __init__(self):
        """Prepare request headers, open the MySQL connection and ensure the table exists."""
        self.url = 'https://movie.douban.com/top250?start=0&filter='
        self.headers = {
            #'Referer':'https: // movie.douban.com / chart',
            #'Cookie':'bid=gVuAVuJnpV8; douban-fav-remind=1; __gads=ID=0c0589db7bc02989-222319acd0d8005b:T=1670679624:RT=1690191016:S=ALNI_MaybNbU2ibuss_Yoadp3uJifodJ9w; __gpi=UID=00000b8d4e13da77:T=1670679624:RT=1690191016:S=ALNI_MbrJ5USO84EwVeXhf3XCxTdm_UKMw; __utma=30149280.598903103.1670679629.1690550203.1692358726.7; __utmz=30149280.1679753998.2.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; ll="118185"; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1692358702%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DTUgCMzOnL6aSoR-lv8FmM…000000664df5815%22%5D; _pk_id.100001.4cf6=a0aa6bb646987768.1679753997.; __utma=223695111.473136323.1679753998.1679753998.1692358726.2; __utmz=223695111.1679753998.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __yadk_uid=WnhD8UcTmMqr5dtll8AdTC1fUINd4HeC; _vwo_uuid_v2=D284D0545B8F29E9F6723C86D9E37A2D9|ab3c2e4fd4d484201803599faf47d899; viewed="36168842_36319143_36291673"; _pk_ses.100001.4cf6=1; ap_v=0,6.0; __utmb=30149280.0.10.1692358726; __utmc=30149280; __utmb=223695111.0.10.1692358726; __utmc=223695111',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.8810.3391 Safari/537.36 Edge/18.14383',
            'Content-Type': 'text/html; charset=utf-8'
        }
        # Open the database connection.
        # NOTE(review): credentials are hard-coded — fine for a local demo,
        # move to config/env for anything shared.
        self.connect = connect(
            user='root',
            password='123456',
            host='127.0.0.1',
            port=3306,
            database='movie',
            charset='utf8'
        )
        # Cursor used for all statements below.
        self.cs = self.connect.cursor()
        try:
            # IF NOT EXISTS makes reruns idempotent instead of erroring once
            # the table already exists. The `id` column is never populated by
            # save_data and stays NULL — kept for schema compatibility.
            create_sqli = ("create table if not exists textben ("
                           "id int, title varchar(255), title_link varchar(255),"
                           "jpg_link varchar(255), content longtext);")
            self.cs.execute(create_sqli)
        except Exception as e:
            print("创建数据表失败:", e)
        else:
            print("创建数据表成功;")

    def get_data(self):
        """Fetch the page at ``self.url``, extract every movie and store it.

        Prints each movie's links, title and cleaned info text to stdout.
        """
        # timeout prevents the scraper from hanging forever on a stalled request.
        response = requests.get(url=self.url, headers=self.headers, timeout=10)
        datas = self.ITEM_RE.findall(response.text)
        for movie_link, jpg_link, movie_title, info in datas:
            # Strip HTML entity padding and truncation markers from the info blurb.
            info = (info
                    .replace('&nbsp;&nbsp;&nbsp;', '')
                    .replace('&nbsp;/&nbsp;', '')
                    .replace('...<br>', '')
                    .replace('  ', ''))
            self.save_data(movie_title, movie_link, jpg_link, info)
            print('电影链接:', movie_link)
            print('图片链接:', jpg_link)
            print('电影名称:', movie_title)
            print('其它信息:', info.strip())
            print('==========================================================')

    def save_data(self, title, title_link, jpg_link, content):
        """Insert one movie row into ``textben`` and commit.

        Parameterized query — values are never interpolated into the SQL string.
        """
        sql = 'insert into textben(title,title_link, jpg_link,content) values(%s, %s, %s, %s)'
        self.cs.execute(sql, [title, title_link, jpg_link, content])
        self.connect.commit()
        print('======================爬取完成===================')

    def close(self):
        """Release the cursor and the database connection."""
        self.cs.close()
        self.connect.close()



if __name__=='__main__':
    # Walk the Top250 list one page (25 movies) at a time.
    spider = Spider()
    page_url = 'https://movie.douban.com/top250?start={}&filter='
    for start in range(0, 250, 25):
        print('-------------正在爬取排名{}-{}的电影-----------------'.format(start, start + 25))
        # Be polite to the server between page fetches.
        time.sleep(2)
        spider.url = page_url.format(start)
        spider.get_data()